import torch
import matplotlib.pyplot as plt
from IPython.display import Audio
from utils import plot_spectrogram
#TODO: changeme
%cd /Users/janne/git/tutorial/codes
# from codes.data_loader import GTZANLoader
/Users/janne/git/tutorial/codes
Audio Data Augmentations¶
In this chapter, we will discuss common transformations that we can apply to audio signals. We will refer to these as “audio data augmentations”.
Data augmentations are a set of methods that add modified copies to a dataset, from the existing data. This process creates many variations of natural data, and can act as a regulariser to reduce the problem of overfitting. It can also help deep neural networks become robust to complex variations of natural data, which improves their generalisation performance.
In the field of computer vision, the transformations that we apply to images are often very self-explanatory. Take this image, for example. It becomes fairly obvious that we have applied various amounts of Gaussian blurring to this image.

Naturally, we cannot translate transformations from the vision domain directly to the audio domain. Before we explore a battery of audio data augmentations, we now list the currently available code libraries:
Code Libraries¶
| Name | Author | Framework | Language | License |
|---|---|---|---|---|
| Muda | B. McFee et al. (2015) | General Purpose | Python | ISC License |
| Audio Degradation Toolbox | M. Mauch et al. (2013) | General Purpose | MATLAB | GNU General Public License 2.0 |
| rubberband | - | General Purpose | C++ | GNU General Public License (non-commercial) |
| audiomentations | I. Jordal (2021) | General Purpose | Python | MIT License |
| tensorflow-io | tensorflow.org | TensorFlow | Python | Apache 2.0 License |
| torchaudio | pytorch.org | PyTorch | Python | BSD 2-Clause "Simplified" License |
| torch-audiomentations | Asteroid (2021) | PyTorch | Python | MIT License |
| torchaudio-augmentations | J. Spijkervet (2021) | PyTorch | Python | MIT License |
Listening¶
One of the most essential, and yet overlooked, parts of music research is exploring and observing the data. This also applies to data augmentation research: one has to develop a general understanding of the effect of transformations that can be applied to audio. Even more so, when transformations are applied sequentially.
For instance, we will understand why a reverb applied before a frequency filter will sound different than when the reverb is applied after the frequency filter. Before we develop this intuition, let’s listen to a series of audio data augmentations.
# Load the GTZAN genre-classification dataset; `download=True` fetches it
# into the current working directory on first run.
from torchaudio.datasets import GTZAN
dataset = GTZAN(root=".", download=True)
print(len(dataset))
# Each item is a (waveform, sample_rate, genre_label) tuple; `audio` has
# shape (channels, samples).
audio, sr, genre = dataset[5]
print(f"Genre: {genre}\nSample rate: {sr}\nChannels: {audio.shape[0]}\nSamples: {audio.shape[1]}")
# `display(Audio(...))` renders an inline audio player (notebook-only).
display(Audio(audio, rate=sr))
Augmentation Modules¶
from torchaudio_augmentations import Compose, ComposeMany
from torchaudio_augmentations import (
    Delay,
    Gain,
    HighLowPass,
    Noise,
    PitchShift,
    PolarityInversion,
    RandomResizedCrop,
    Reverb,
)
import math

# Build exactly one period of a 440 Hz sine wave at the dataset's sample
# rate, shaped (1, num_samples) so it matches the (channels, samples) layout.
period = 1 / 440.0
time_axis = torch.linspace(0, period, int(period * sr))
test_audio = torch.sin(math.tau * 440.0 * time_axis).unsqueeze(0)
plt.plot(test_audio.squeeze(0))
plt.show()

# Polarity inversion flips the sign of every sample; the plot should be the
# mirror image of the one above.
inverted_test_audio = PolarityInversion()(test_audio)
plt.plot(inverted_test_audio.squeeze(0))
plt.show()
Compose Module¶
# A Compose pipeline with a single Delay (echo) effect.
delay = Delay(
    sample_rate=sr,
    volume_factor=0.5,
    min_delay=100,
    max_delay=500,
    delay_interval=1,
)
transform = Compose([delay])
print("Transform:", transform)
transformed_audio = transform(audio)
# Play the original first, then the delayed version for comparison.
display(Audio(audio, rate=sr))
display(Audio(transformed_audio, rate=sr))
Transform: Compose(
Delay()
)
Stack audio augmentations¶
# Crop to a fixed 4-second window before applying the other effects.
num_samples = sr * 4
crop = RandomResizedCrop(n_samples=num_samples)
# Band-limiting filter: low-pass and high-pass cutoff ranges in Hz.
band_filter = HighLowPass(
    sample_rate=sr,
    lowpass_freq_low=2200,
    lowpass_freq_high=4000,
    highpass_freq_low=200,
    highpass_freq_high=1200,
)
echo = Delay(
    sample_rate=sr,
    volume_factor=0.5,
    min_delay=100,
    max_delay=500,
    delay_interval=1,
)
# Effects are applied sequentially in list order.
transforms = [crop, band_filter, echo]
transform = Compose(transforms)
print("Transform:", transform)
transformed_audio = transform(audio)
display(Audio(transformed_audio, rate=sr))
Transform: Compose(
RandomResizedCrop()
HighLowPass()
Delay()
)
Return multiple augmented samples¶
# ComposeMany returns several independently augmented versions of the clip.
num_augmented_samples = 4
transform = ComposeMany(transforms, num_augmented_samples=num_augmented_samples)
print("Transform:", transform)
transformed_audio = transform(audio)
# Inspect each augmented sample both visually and audibly.
for ta in transformed_audio:
    plot_spectrogram(ta, sr, title="")
    display(Audio(ta, rate=sr))
    plt.show()
Transform: ComposeMany(
RandomResizedCrop()
HighLowPass()
Delay()
)
Stochastic Audio Data Augmentations¶
# BUG FIX: in the original notebook RandomApply was only imported in a
# later cell, so running top-to-bottom raised a NameError here.
from torchaudio_augmentations import RandomApply

transforms = [
    PolarityInversion(),
    PitchShift(sample_rate=sr, n_samples=audio.shape[1]),
    Reverb(sample_rate=sr)
]
# Apply the whole chain with probability 0.5; otherwise pass audio through.
stochastic_transforms = [
    RandomApply(transforms, p=0.5)
]
transform = Compose(stochastic_transforms)
print(transform)
transformed_audio = transform(audio)
display(Audio(transformed_audio, rate=sr))
Audio chain stochastic augmentations¶
from torchaudio_augmentations import RandomApply

# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4
# 4 seconds of audio
num_samples = sr * 4
stochastic_transforms = [
    RandomResizedCrop(n_samples=num_samples),
    # apply this sub-chain with p = 0.3
    RandomApply([
        PolarityInversion(),
        HighLowPass(
            sample_rate=sr,
            lowpass_freq_low=2200,
            lowpass_freq_high=4000,
            highpass_freq_low=200,
            highpass_freq_high=1200,
        ),
        Delay(
            sample_rate=sr,
            volume_factor=0.5,
            min_delay=100,
            max_delay=500,
            delay_interval=1,
        ),
    ],
    p=0.3),
    # apply this sub-chain with p = 0.8
    RandomApply([
        PitchShift(sample_rate=sr, n_samples=num_samples),
        Gain(),
        Noise(max_snr=0.01),
        Reverb(sample_rate=sr)
    ],
    p=0.8)
]
transform = ComposeMany(stochastic_transforms, num_augmented_samples=num_augmented_samples)
print("Transform:", transform)
# BUG FIX: the original cell never applied the new transform; the loop
# below iterated the stale `transformed_audio` from a previous cell.
transformed_audio = transform(audio)
for ta in transformed_audio:
    display(Audio(ta, rate=sr))
    plt.show()
plt.show()
Transform: ComposeMany(
RandomResizedCrop()
RandomApply(
p=0.3
PolarityInversion()
HighLowPass()
Delay()
)
RandomApply(
p=0.8
<torchaudio_augmentations.augmentations.pitch_shift.PitchShift object at 0x7fda773b7d60>
Gain()
Noise()
Reverb()
)
)
Single stochastic augmentations¶
# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4
# 4 seconds of audio
num_samples = sr * 4
# define our stochastic augmentations: each effect is applied
# independently, with its own probability
transforms = [
    RandomResizedCrop(n_samples=num_samples),
    RandomApply([PolarityInversion()], p=0.8),
    RandomApply([HighLowPass(sample_rate=sr)], p=0.6),
    RandomApply([Delay(sample_rate=sr)], p=0.6),
    RandomApply([PitchShift(sample_rate=sr, n_samples=num_samples)], p=0.3),
    RandomApply([Gain()], p=0.6),
    RandomApply([Noise(max_snr=0.01)], p=0.3),
    RandomApply([Reverb(sample_rate=sr)], p=0.5)
]
transform = ComposeMany(transforms, num_augmented_samples=num_augmented_samples)
print("Transform:", transform)
transformed_audio = transform(audio)
for ta in transformed_audio:
    # BUG FIX: original read `title=e=""`, which is a syntax error
    plot_spectrogram(ta, sr, title="")
    display(Audio(ta, rate=sr))
    plt.show()
Transform: ComposeMany(
RandomResizedCrop()
RandomApply(
p=0.8
PolarityInversion()
)
RandomApply(
p=0.6
HighLowPass()
)
RandomApply(
p=0.6
Delay()
)
RandomApply(
p=0.3
<torchaudio_augmentations.augmentations.pitch_shift.PitchShift object at 0x7fda7980f1f0>
)
RandomApply(
p=0.6
Gain()
)
RandomApply(
p=0.3
Noise()
)
RandomApply(
p=0.5
Reverb()
)
)